5300project_q8

library(readxl)
data <- read_excel("./311_Cases_cleaned.xlsx")
New names:
• `` -> `...1`
str(data)
tibble [519,919 × 27] (S3: tbl_df/tbl/data.frame)
 $ ...1           : num [1:519919] 1 2 3 4 5 6 7 8 9 10 ...
 $ neighborhood   : chr [1:519919] "Bayview" "Bayview" "Bayview" "Bayview" ...
 $ year           : num [1:519919] 2012 2012 2012 2012 2012 ...
 $ mobile.dumm    : num [1:519919] 1 0 1 0 0 0 0 0 0 1 ...
 $ case           : chr [1:519919] "Street and Sidewalk Cleaning" "Street and Sidewalk Cleaning" "Street and Sidewalk Cleaning" "Street and Sidewalk Cleaning" ...
 $ month          : chr [1:519919] "September" "December" "August" "October" ...
 $ request        : chr [1:519919] "General Cleaning" "Bulky Items" "General Cleaning" "General Cleaning" ...
 $ notes          : chr [1:519919] "Case Completed - resolved:" "Case Completed - resolved:" "Case Completed - resolved:" "Case Completed - resolved:" ...
 $ photo.dumm     : num [1:519919] 0 0 0 0 0 0 0 0 0 0 ...
 $ income         : num [1:519919] 56718 56718 56718 56718 56718 ...
 $ population     : num [1:519919] 23467 23467 23467 23467 23467 ...
 $ num_cases      : num [1:519919] 2092 2092 2092 2092 2092 ...
 $ white          : num [1:519919] 0.2 0.2 0.2 0.2 0.2 ...
 $ black          : num [1:519919] 0.294 0.294 0.294 0.294 0.294 ...
 $ asian          : num [1:519919] 0.352 0.352 0.352 0.352 0.352 ...
 $ resolution_time: chr [1:519919] "1.45638888888889" "115.224722222222" "121.911111111111" "1.32083333333333" ...
 $ agency         : chr [1:519919] "DPW Ops Queue" "DPW Ops Queue" "DPW Ops Queue" "DPW Ops Queue" ...
 $ resolution_rate: num [1:519919] 1 1 1 1 0 1 1 0 0 1 ...
 $ CaseID         : num [1:519919] 1403458 1744682 1300261 1616379 1094069 ...
 $ latitude       : num [1:519919] 37.7 37.7 37.7 37.7 37.7 ...
 $ longitude      : num [1:519919] -122 -122 -122 -122 -122 ...
 $ date           : chr [1:519919] "09/10/2012 06:50:38 AM" "12/01/2012 03:02:55 PM" "08/18/2012 07:20:42 PM" "10/30/2012 08:57:12 AM" ...
 $ case_29        : chr [1:519919] "Street and Sidewalk Cleaning" "Street and Sidewalk Cleaning" "Street and Sidewalk Cleaning" "Street and Sidewalk Cleaning" ...
 $ registered     : num [1:519919] 19566 19566 19566 19566 19566 ...
 $ turnout        : num [1:519919] 61.2 61.2 61.2 61.2 61.2 ...
 $ turnout_one_lag: num [1:519919] 47.2 47.2 47.2 47.2 47.2 ...
 $ turnout_two_lag: num [1:519919] 69.7 69.7 69.7 69.7 69.7 ...
summary(data)
      ...1        neighborhood            year       mobile.dumm    
 Min.   :     1   Length:519919      Min.   :2012   Min.   :0.0000  
 1st Qu.:129981   Class :character   1st Qu.:2016   1st Qu.:0.0000  
 Median :259960   Mode  :character   Median :2016   Median :1.0000  
 Mean   :259960                      Mean   :2016   Mean   :0.5532  
 3rd Qu.:389940                      3rd Qu.:2018   3rd Qu.:1.0000  
 Max.   :519919                      Max.   :2018   Max.   :1.0000  
     case              month             request             notes          
 Length:519919      Length:519919      Length:519919      Length:519919     
 Class :character   Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character   Mode  :character  
                                                                            
                                                                            
                                                                            
   photo.dumm         income         population      num_cases    
 Min.   :0.0000   Min.   : 20323   Min.   : 2204   Min.   :  295  
 1st Qu.:0.0000   1st Qu.: 70142   1st Qu.:18524   1st Qu.: 5676  
 Median :0.0000   Median : 85873   Median :22638   Median :10683  
 Mean   :0.4453   Mean   : 90201   Mean   :31521   Mean   :22049  
 3rd Qu.:1.0000   3rd Qu.:113392   3rd Qu.:55721   3rd Qu.:44735  
 Max.   :1.0000   Max.   :181125   Max.   :59639   Max.   :58946  
     white            black             asian         resolution_time   
 Min.   :0.1129   Min.   :0.00000   Min.   :0.08266   Length:519919     
 1st Qu.:0.3624   1st Qu.:0.02526   1st Qu.:0.14214   Class :character  
 Median :0.5301   Median :0.03199   Median :0.33398   Mode  :character  
 Mean   :0.4659   Mean   :0.06133   Mean   :0.31400                     
 3rd Qu.:0.5750   3rd Qu.:0.10684   3rd Qu.:0.41885                     
 Max.   :0.8617   Max.   :0.29837   Max.   :0.83816                     
    agency          resolution_rate      CaseID            latitude    
 Length:519919      Min.   :0.0000   Min.   : 1076093   Min.   : 0.00  
 Class :character   1st Qu.:0.0000   1st Qu.: 5424082   1st Qu.:37.75  
 Mode  :character   Median :1.0000   Median : 6390464   Median :37.77  
                    Mean   :0.7179   Mean   : 6643873   Mean   :37.76  
                    3rd Qu.:1.0000   3rd Qu.: 9094490   3rd Qu.:37.78  
                    Max.   :1.0000   Max.   :10300841   Max.   :37.81  
   longitude          date             case_29            registered   
 Min.   :-141.2   Length:519919      Length:519919      Min.   : 7325  
 1st Qu.:-122.4   Class :character   Class :character   1st Qu.:18222  
 Median :-122.4   Mode  :character   Mode  :character   Median :24674  
 Mean   :-122.4                                         Mean   :25032  
 3rd Qu.:-122.4                                         3rd Qu.:32602  
 Max.   :   0.0                                         Max.   :41607  
    turnout      turnout_one_lag turnout_two_lag
 Min.   :43.05   Min.   :43.05   Min.   :43.05  
 1st Qu.:66.66   1st Qu.:53.52   1st Qu.:53.52  
 Median :74.83   Median :72.19   Median :61.23  
 Mean   :71.32   Mean   :67.80   Mean   :62.82  
 3rd Qu.:79.75   3rd Qu.:80.83   3rd Qu.:72.19  
 Max.   :89.07   Max.   :89.07   Max.   :88.98  
colSums(is.na(data))
           ...1    neighborhood            year     mobile.dumm            case 
              0               0               0               0               0 
          month         request           notes      photo.dumm          income 
              0               0             185               0               0 
     population       num_cases           white           black           asian 
              0               0               0               0               0 
resolution_time          agency resolution_rate          CaseID        latitude 
              0               0               0               0               0 
      longitude            date         case_29      registered         turnout 
              0               0               0               0               0 
turnout_one_lag turnout_two_lag 
              0               0 
data$date <- as.Date(data$date, format="%m/%d/%Y %H:%M:%S %p")
data$case <- as.factor(data$case)
data$month <- as.factor(data$month)
data$year <- as.numeric(as.character(data$year))
mean_resolution_rate <- mean(data$resolution_rate, na.rm = TRUE)
print(paste("Average Resolution Rate:", mean_resolution_rate))
[1] "Average Resolution Rate: 0.71793875584466"
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)

Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':

    last_plot
The following object is masked from 'package:stats':

    filter
The following object is masked from 'package:graphics':

    layout
case_type_rates <- data %>%
  group_by(case) %>%
  summarise(Average_Resolution_Rate = mean(resolution_rate, na.rm = TRUE))

g <- ggplot(case_type_rates, aes(x = reorder(case, -Average_Resolution_Rate), y = Average_Resolution_Rate, fill = case)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1, size = 3)) +
  labs(title = "Average Resolution Rate by Case Type", x = "Case Type", y = "Average Resolution Rate")

ggplotly(g)
data$date <- as.Date(data$date, format="%Y-%m-%d")

yearly_rates <- data %>%
  mutate(Year = format(date, "%Y")) %>%
  group_by(Year) %>%
  summarise(Average_Resolution_Rate = mean(resolution_rate, na.rm = TRUE))

ggplot(data = yearly_rates, aes(x = Year, y = Average_Resolution_Rate, group = 1)) + 
  geom_line() +
  geom_point() + 
  labs(title = "Trend of Resolution Rate Over Years", x = "Year", y = "Average Resolution Rate")

monthly_rates <- data %>%
  mutate(Month = format(date, "%Y-%m")) %>%
  group_by(Month) %>%
  summarise(Average_Resolution_Rate = mean(resolution_rate, na.rm = TRUE))

#monthly_rates$Month <- as.Date(as.character(monthly_rates$Month), format="%Y-%m")



ggplot(monthly_rates, aes(x = Month, y = Average_Resolution_Rate, group = 1)) +
  geom_line() + 
  geom_point() +
  labs(title = "Trend of Resolution Rate Over Months", x = "Month", y = "Average Resolution Rate") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(case_type_rates, aes(x = reorder(case, -Average_Resolution_Rate), y = Average_Resolution_Rate)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Average Resolution Rate by Case Type", x = "Case Type", y = "Average Resolution Rate") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

ggplot(data, aes(x = case, y = resolution_rate)) +
  geom_boxplot() +
  labs(title = "Distribution of Resolution Rates by Case Type", x = "Case Type", y = "Resolution Rate") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

library(forecast)
Warning: package 'forecast' was built under R version 4.3.3
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
library(lubridate)

Attaching package: 'lubridate'
The following objects are masked from 'package:base':

    date, intersect, setdiff, union
monthly_rates$Month <- as.Date(paste0(monthly_rates$Month, "-01"), format="%Y-%m-%d")

resolution_ts <- ts(monthly_rates$Average_Resolution_Rate, 
                    start = c(year(min(monthly_rates$Month)), 
                              month(min(monthly_rates$Month))), 
                    frequency = 12)


fit_arima <- auto.arima(resolution_ts)

summary(fit_arima)
Series: resolution_ts 
ARIMA(2,0,0) with non-zero mean 

Coefficients:
         ar1      ar2    mean
      1.0717  -0.4595  0.7221
s.e.  0.1380   0.1979  0.0300

sigma^2 = 0.005813:  log likelihood = 54.05
AIC=-100.1   AICc=-99.13   BIC=-92.79

Training set error measures:
                      ME       RMSE        MAE       MPE     MAPE      MASE
Training set 0.002655479 0.07371691 0.04011616 -1.891985 7.329857 0.4699802
                   ACF1
Training set -0.1870509
forecast_arima <- forecast(fit_arima, h=12)

print(forecast_arima)
         Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
Jan 2016      0.7096100 0.6118980 0.8073220 0.5601724 0.8590476
Feb 2016      0.7222669 0.5790435 0.8654903 0.5032257 0.9413082
Mar 2016      0.7280034 0.5697454 0.8862614 0.4859687 0.9700381
Apr 2016      0.7283356 0.5682626 0.8884086 0.4835251 0.9731462
May 2016      0.7260559 0.5658993 0.8862125 0.4811175 0.9709943
Jun 2016      0.7234601 0.5624466 0.8844736 0.4772113 0.9697089
Jul 2016      0.7217258 0.5599774 0.8834741 0.4743530 0.9690985
Aug 2016      0.7210598 0.5590678 0.8830518 0.4733143 0.9688052
Sep 2016      0.7211429 0.5591325 0.8831534 0.4733694 0.9689165
Oct 2016      0.7215381 0.5595210 0.8835551 0.4737544 0.9693218
Nov 2016      0.7219233 0.5598839 0.8839627 0.4741054 0.9697412
Dec 2016      0.7221546 0.5601001 0.8842091 0.4743137 0.9699955
plot(forecast_arima)

checkresiduals(fit_arima)


    Ljung-Box test

data:  Residuals from ARIMA(2,0,0) with non-zero mean
Q* = 8.9872, df = 7, p-value = 0.2536

Model df: 2.   Total lags used: 9